1 Preprocess

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(knitr)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(MuMIn)
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## 
## The following object is masked from 'package:lme4':
## 
##     lmer
## 
## The following object is masked from 'package:stats':
## 
##     step
source("utils.R")
source("preprocess.R")
## Joining, by = c("item_id", "item", "condition", "first_mention",
## "recent_mention", "knowledge_cue", "start", "end")

2 Exclusions

MTurk slightly oversampled to 1161 participants. Only 13 ppts indicated they are not native English speakers.

# Exclusion summary for the full sample — counts and percentages removed per
# reason (non-native English, runtime) plus totals. Helper presumably defined
# in utils.R or preprocess.R (sourced above) — TODO confirm.
summarise.exclusions(ppts.all)
Reason Removed (%)
ex.native_eng 13 1.1
ex.runtime 0 0.0
—— NA NA
Total Removed 13 1.1
Retained 1143 98.9

3 Attention Checks

3.1 By participant

55% of ppts (630/1143) passed both attention checks.

# Distribution of per-participant attention accuracy (2 checks per ppt, so
# possible values are 0, 0.5, 1).
#
# BUG FIX: the original divided by nrow(attention) — the number of attention
# TRIALS (2 per participant) — so the proportions summed to 0.50
# (0.13 + 0.09 + 0.28 in the knitted output). The denominator must be the
# number of participants; dividing n by sum(n) after collapsing to one row
# per participant gives proportions that sum to 1.
attention %>%
  group_by(participant_id) %>%
  summarize(accuracy = mean(accuracy), .groups = "drop") %>%
  count(accuracy, name = "n") %>%
  mutate(prop = round(n / sum(n), 2))
accuracy n prop
0.0 297 0.13
0.5 216 0.09
1.0 630 0.28
# Among incorrect attention responses, how many named the start/end location
# vs. something else? (Values unchanged: 606/810 = 0.75, 204/810 = 0.25.)
# Improvements: `!is_correct` instead of `== F` (T/F are reassignable), and
# `sum(n)` as the denominator instead of re-filtering the whole data frame
# inside summarize().
attention %>%
  filter(!is_correct) %>%
  count(is_start_or_end, name = "n") %>%
  mutate(prop = round(n / sum(n), 2))
is_start_or_end n prop
FALSE 606 0.75
TRUE 204 0.25

3.2 By item

Item accuracy looks normal.

# Mean attention accuracy per item x question, sorted from most to least
# accurate. After summarize() there is exactly one value per group, so the
# original stat_summary(fun = "mean") was an identity transform; geom_col()
# draws the same bars directly.
attention %>%
  mutate(
    item_question_id = paste0(item, "_", question_id)
  ) %>%
  group_by(item_question_id) %>%
  summarize(accuracy = mean(accuracy), .groups = "drop") %>%
  ggplot(aes(x = accuracy, y = reorder(item_question_id, -accuracy))) +
  geom_col()

4 Critical Trials

4.1 Distribution of trials after exclusions

The distribution by item looks fairly normal but does lead to some extreme cases (e.g. no cases for top left, TB, 12).

# Retained trial counts per item x condition within each
# first-mention / recent-mention cell. ("count" is geom_bar's default stat.)
critical %>%
  filter(!excluded.attention) %>%
  ggplot(aes(x = factor(item), fill = condition)) +
  geom_bar(position = "dodge") +
  facet_grid(cols = vars(first_mention), rows = vars(recent_mention),
             labeller = "label_both")

# Retained trial counts per knowledge cue x condition within each
# first-mention / recent-mention cell.
critical %>%
  filter(!excluded.attention) %>%
  ggplot(aes(x = knowledge_cue, fill = condition)) +
  geom_bar(position = "dodge") +
  facet_grid(cols = vars(first_mention), rows = vars(recent_mention),
             labeller = "label_both")

4.2 Overall accuracy

Accuracy is 80% for ppts who passed the attention checks (and 25% for those who didn’t).

# Mean critical-trial accuracy for retained (green) vs excluded (red) ppts.
critical %>%
  ggplot(aes(x = excluded.attention, y = accuracy, fill = excluded.attention)) +
  stat_summary(fun = mean, geom = "bar") +
  scale_fill_manual(values = c("#009933", "#FF0000"))

# Same comparison as the plot above, in table form.
critical %>%
  group_by(excluded.attention) %>%
  summarize(accuracy = mean(accuracy), n = n(), .groups = "drop")
excluded.attention accuracy n
FALSE 0.8047619 630
TRUE 0.2553606 513
# Mean accuracy by condition, retained participants only.
critical %>%
  filter(!excluded.attention) %>%
  ggplot(aes(x = condition, y = accuracy, fill = condition)) +
  stat_summary(fun = mean, geom = "bar")

# Among retained trials where the response named the start or end location,
# what fraction named the START location, by condition?
critical %>%
  filter(!excluded.attention, is_start | is_end) %>%
  group_by(condition) %>%
  summarize(start = mean(is_start), n = n(), .groups = "drop")
condition start n
False Belief 0.8964401 309
True Belief 0.2434211 304
# Recode correctness as numeric 0/1 (as.numeric on a logical is the
# idiomatic equivalent of ifelse(x, 1, 0), NA-preserving).
# NOTE(review): this mutate runs *after* `accuracy` is already used in the
# plots/tables above — presumably preprocess.R defines it earlier; confirm,
# otherwise this chunk should move up.
critical <- critical %>%
  mutate(
    accuracy = as.numeric(is_correct)
  )

4.3 By item

None of the items look particularly easy/hard.

# Per-item mean accuracy, side-by-side for retained vs excluded ppts.
critical %>%
  ggplot(aes(x = reorder(item, -accuracy), y = accuracy,
             color = excluded.attention)) +
  stat_summary(fun = mean, geom = "point") +
  facet_grid(cols = vars(excluded.attention), labeller = label_both) +
  scale_color_manual(values = c("#009933", "#FF0000"))

The incorrect answers from retained ppts mostly look like genuine mistakes.

# Inspect all incorrect responses from retained participants.
critical %>%
  filter(!is_correct, !excluded.attention) %>%
  select(participant_id, item_id, correct_answer, response, is_correct) %>%
  arrange(item_id)
participant_id item_id correct_answer response is_correct
1758 1_fb_1_e_s_ex box room FALSE
1893 1_fb_1_e_s_im box box but finds it is missing FALSE
415 1_tb_1_e_e_ex basket room FALSE
1411 1_tb_1_e_e_ex basket box FALSE
669 1_tb_1_e_s_ex basket box FALSE
1134 1_tb_1_e_s_ex basket box FALSE
1853 1_tb_1_s_s_im basket box FALSE
150 10_fb_1_e_e_im toolbox van FALSE
1969 10_fb_1_e_e_im toolbox van FALSE
314 10_fb_1_s_e_im toolbox van FALSE
645 10_fb_1_s_e_im toolbox van FALSE
1153 10_fb_1_s_e_im toolbox van FALSE
1409 10_fb_1_s_e_im toolbox van FALSE
1672 10_fb_1_s_s_im toolbox van FALSE
885 10_tb_1_e_e_ex van toolbox FALSE
640 10_tb_1_e_s_ex van toolbox FALSE
1425 10_tb_1_e_s_ex van toolbox FALSE
1580 10_tb_1_e_s_ex van toolbox FALSE
1844 10_tb_1_e_s_im van toolbox FALSE
914 10_tb_1_s_e_im van toolbox FALSE
1972 10_tb_1_s_e_im van toolbox FALSE
1641 11_fb_1_e_s_im suitcase backpack FALSE
1817 11_fb_1_s_e_im suitcase backpack FALSE
902 11_tb_1_s_e_ex backpack suitcase FALSE
1967 11_tb_1_s_e_im backpack suitcase FALSE
612 11_tb_1_s_s_ex backpack suitcase FALSE
1302 11_tb_1_s_s_im backpack suitcase FALSE
1423 11_tb_1_s_s_im backpack suitcase FALSE
1009 12_fb_1_e_s_im stable hut FALSE
1219 12_fb_1_s_s_im stable hut FALSE
1269 12_tb_1_e_s_im hut stable FALSE
209 12_tb_1_s_e_ex hut stable FALSE
520 12_tb_1_s_e_ex hut stable FALSE
1491 12_tb_1_s_e_ex hut stable FALSE
1837 12_tb_1_s_e_ex hut stable FALSE
1777 12_tb_1_s_s_ex hut stable FALSE
1835 2_fb_1_e_e_im cupboard sandwich FALSE
1839 2_fb_1_e_s_im cupboard fridge FALSE
339 2_tb_1_e_s_im fridge cupboard FALSE
112 2_tb_1_s_e_ex fridge cupboard FALSE
1528 2_tb_1_s_s_ex fridge cupboard FALSE
1963 2_tb_1_s_s_ex fridge cupboard FALSE
861 2_tb_1_s_s_im fridge cupboard FALSE
926 2_tb_1_s_s_im fridge cupboard FALSE
1804 2_tb_1_s_s_im fridge cupboard FALSE
159 3_fb_1_e_e_im sink basket FALSE
704 3_fb_1_s_s_ex sink stain FALSE
1416 3_tb_1_e_e_ex basket sink FALSE
247 3_tb_1_e_e_im basket sink FALSE
225 3_tb_1_e_s_ex basket sink FALSE
372 3_tb_1_s_s_ex basket sink FALSE
1298 3_tb_1_s_s_ex basket stain FALSE
1505 3_tb_1_s_s_im basket sink FALSE
1101 4_fb_1_e_e_im shed garage FALSE
518 4_fb_1_e_s_im shed garage FALSE
1947 4_fb_1_s_e_im shed garage FALSE
342 4_tb_1_e_e_ex garage shed FALSE
174 4_tb_1_e_e_im garage shed FALSE
408 4_tb_1_e_s_ex garage shed FALSE
349 4_tb_1_e_s_ex garage shed FALSE
1677 4_tb_1_e_s_ex garage shed FALSE
1417 4_tb_1_s_s_ex garage shed FALSE
2021 4_tb_1_s_s_ex garage shed FALSE
591 4_tb_1_s_s_im garage yard FALSE
2019 4_tb_1_s_s_im garage shed FALSE
371 5_fb_1_e_e_im hall study FALSE
366 5_fb_1_s_e_im hall bathroom FALSE
1464 5_fb_1_s_e_im hall study FALSE
121 5_tb_1_e_e_im study hall FALSE
1590 5_tb_1_e_s_ex study hall FALSE
1819 5_tb_1_e_s_ex study hall FALSE
309 5_tb_1_s_e_im study hall FALSE
1892 5_tb_1_s_s_ex study hall FALSE
1720 5_tb_1_s_s_im study hall FALSE
957 6_fb_1_s_e_ex drawer cabinet FALSE
1264 6_fb_1_s_e_ex drawer cabinet FALSE
258 6_fb_1_s_s_ex drawer ross FALSE
1930 6_fb_1_s_s_im drawer ross FALSE
839 6_tb_1_e_e_ex cabinet drawer FALSE
555 6_tb_1_e_e_im cabinet ross wanders FALSE
1684 6_tb_1_e_s_im cabinet drawer FALSE
178 6_tb_1_s_e_ex cabinet drawer FALSE
1077 6_tb_1_s_e_ex cabinet drawer FALSE
717 6_tb_1_s_e_im cabinet drawer FALSE
1018 6_tb_1_s_s_ex cabinet drawer FALSE
1376 7_fb_1_e_s_im garage fridge FALSE
930 7_fb_1_s_e_ex garage fridge FALSE
1667 7_fb_1_s_e_im garage fridge FALSE
183 7_fb_1_s_s_im garage fridge FALSE
1371 7_fb_1_s_s_im garage fridge FALSE
1063 7_tb_1_e_e_ex fridge garage FALSE
1253 7_tb_1_e_s_ex fridge garage FALSE
1809 7_tb_1_s_e_ex fridge garage FALSE
1928 7_tb_1_s_s_ex fridge garaage FALSE
895 8_fb_1_s_e_im hall bedroom FALSE
399 8_fb_1_s_s_ex hall bedroom FALSE
660 8_tb_1_e_e_ex bedroom good FALSE
1739 8_tb_1_e_e_ex bedroom hall FALSE
730 8_tb_1_e_e_im bedroom hall FALSE
1000 8_tb_1_e_e_im bedroom hall FALSE
941 8_tb_1_e_s_ex bedroom hall FALSE
1035 8_tb_1_e_s_ex bedroom hall FALSE
147 8_tb_1_s_e_ex bedroom hall FALSE
123 8_tb_1_s_e_im bedroom hall FALSE
574 8_tb_1_s_e_im bedroom hall FALSE
920 8_tb_1_s_e_im bedroom garden FALSE
1568 8_tb_1_s_s_ex bedroom hall FALSE
1833 8_tb_1_s_s_ex bedroom hall FALSE
1933 8_tb_1_s_s_ex bedroom hall FALSE
1806 9_fb_1_e_e_ex cupboard drawer FALSE
890 9_fb_1_e_e_im cupboard drawer FALSE
1828 9_fb_1_e_s_ex cupboard drawer FALSE
1873 9_fb_1_e_s_ex cupboard cabinet FALSE
1082 9_fb_1_s_e_im cupboard drawer FALSE
1245 9_fb_1_s_e_im cupboard drawer FALSE
566 9_tb_1_e_e_ex drawer cupboard FALSE
1715 9_tb_1_e_s_ex drawer cupboard FALSE
1124 9_tb_1_e_s_im drawer cupboard FALSE
844 9_tb_1_s_e_ex drawer cupboard FALSE
1235 9_tb_1_s_e_ex drawer cubpboard FALSE
846 9_tb_1_s_e_im drawer cupboard FALSE
1808 9_tb_1_s_s_ex drawer cupboard FALSE
230 9_tb_1_s_s_im drawer kitchen FALSE

4.4 By condition

First mention shows a noticeable effect. Effects of other vars look small.

# Shared helper for the six plots below: mean of a 0/1 outcome by a
# predictor, split by attention-exclusion status (green = retained,
# red = excluded), with bootstrapped 95% CIs. The six original chunks
# repeated this pipeline verbatim; factoring it out keeps every plot
# identical while removing the duplication.
plot_rate_by <- function(df, xvar, yvar) {
  df %>%
    ggplot(aes(x = {{ xvar }}, y = {{ yvar }}, fill = excluded.attention)) +
    stat_summary(fun = "mean", geom = "bar", position = "dodge", alpha = 0.8) +
    stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
                 position = position_dodge(0.9), width = 0.2) +
    scale_fill_manual(values = c("#009933", "#FF0000"))
}

# Accuracy by condition.
plot_rate_by(critical, condition, accuracy)

# Accuracy by knowledge cue.
plot_rate_by(critical, knowledge_cue, accuracy)

# Accuracy by first mention.
plot_rate_by(critical, first_mention, accuracy)

# P(respond with START location) by first mention.
critical %>%
  mutate(start = ifelse(is_start, 1, 0)) %>%
  plot_rate_by(first_mention, start)

# Accuracy by recent mention.
plot_rate_by(critical, recent_mention, accuracy)

# P(respond with START location) by recent mention.
critical %>%
  mutate(start = ifelse(is_start, 1, 0)) %>%
  plot_rate_by(recent_mention, start)

5 RT x Accuracy

5.1 Attention x Passage Reading Time

# Per-participant mean passage reading time vs. attention accuracy,
# binned on a log-10 x axis with bootstrapped CIs and a linear fit.
attention %>%
  group_by(participant_id) %>%
  summarize(
    across(c(passage_reading_time, accuracy), mean),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = passage_reading_time, y = accuracy)) +
  stat_summary_bin(fun.data = mean_cl_boot, geom = "pointrange",
                   binwidth = 0.1) +
  scale_x_log10() +
  geom_smooth(method = "lm", formula = "y~x") +
  labs(y = "attention_accuracy")
## Warning: Removed 6 rows containing missing values (geom_segment).

5.2 Attention x Reaction Time

# Per-trial reaction time vs. attention accuracy (log-10 x axis).
attention %>%
  ggplot(aes(x = reaction_time, y = accuracy)) +
  stat_summary_bin(fun.data = mean_cl_boot, geom = "pointrange",
                   binwidth = 0.1) +
  scale_x_log10() +
  geom_smooth(method = "lm", formula = "y~x") +
  labs(y = "Attention accuracy")
## Warning: Removed 5 rows containing missing values (geom_segment).

5.3 Critical x Passage Reading Time

# Per-participant mean passage reading time vs. critical-trial accuracy,
# colored by exclusion status.
critical %>%
  group_by(participant_id, excluded.attention) %>%
  summarize(
    across(c(passage_reading_time, accuracy), mean),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = passage_reading_time, y = accuracy,
             color = excluded.attention)) +
  stat_summary_bin(fun.data = mean_cl_boot, geom = "pointrange",
                   binwidth = 0.2) +
  scale_x_log10() +
  geom_smooth(method = "lm", formula = "y~x", se = FALSE) +
  labs(y = "critical accuracy") +
  scale_color_manual(values = c("#009933", "#FF0000"))
## Warning: Removed 5 rows containing missing values (geom_segment).

5.4 Critical x Reaction Time

# Per-trial reaction time vs. critical accuracy, colored by exclusion status.
critical %>%
  ggplot(aes(x = reaction_time, y = accuracy, color = excluded.attention)) +
  stat_summary_bin(fun.data = mean_cl_boot, geom = "pointrange",
                   binwidth = 0.2) +
  scale_x_log10() +
  geom_smooth(method = "lm", formula = "y~x", se = FALSE) +
  labs(y = "critical accuracy") +
  scale_color_manual(values = c("#009933", "#FF0000"))
## Warning: Removed 2 rows containing missing values (geom_segment).

6 GPT-3 Accuracy

Overall GPT-3 accuracy was 0.74.

# GPT-3 accuracy by condition.
df_fb_gpt3_dv %>%
  ggplot(aes(x = condition, y = mdl.accuracy, fill = condition)) +
  stat_summary(fun = mean, geom = "bar")

# GPT-3 accuracy by condition, in table form.
df_fb_gpt3_dv %>%
  group_by(condition) %>%
  summarize(accuracy = mean(mdl.accuracy), n = n(), .groups = "drop")
condition accuracy n
False Belief 0.7812500 96
True Belief 0.7083333 96

7 Pre-registered analysis 1: Does condition predict response?

First, we ask whether condition predicts response, above and beyond the other covariates excluding log_odds from GPT-3.

7.1 Descriptive statistics

Descriptively, we can ask whether a higher proportion of people respond with the START location in the FB or TB condition.

# Proportion of START responses per condition x knowledge cue cell.
df_merged %>%
  group_by(condition, knowledge_cue) %>%
  summarise(
    prop_start = mean(is_start),
    count = n(),
    .groups = "drop"
  )
condition knowledge_cue prop_start count
False Belief Explicit 0.9625000 160
False Belief Implicit 0.8255034 149
True Belief Explicit 0.2974684 158
True Belief Implicit 0.1849315 146

7.2 Visualization

# P(START) by condition, faceted by knowledge cue, with mean +/- 2 SE.
# The three hand-rolled lambdas (mean, mean - 2*sd/sqrt(n),
# mean + 2*sd/sqrt(n)) are exactly ggplot2's mean_se() with mult = 2, so we
# use the built-in instead.
df_merged %>%
  ggplot(aes(x = condition,
             y = is_start_numeric,
             color = condition)) +
  # geom_jitter(alpha = .1) +
  stat_summary(fun.data = mean_se, fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  labs(x = "Condition",
       y = "P(START)") +
  scale_color_viridis_d() +
  theme_bw() +
  facet_wrap(~knowledge_cue,
             nrow = 2)

7.3 Analysis

# Pre-registered analysis 1: logistic mixed model predicting a START
# response from condition plus the other covariates (knowledge cue,
# recent mention, first mention), with by-item random intercepts and
# random condition slopes. GPT-3 log-odds are deliberately excluded here.
model_all_but_lo = glmer(
                  is_start_numeric ~ condition + knowledge_cue+
                    recent_mention + 
                    first_mention +
                    (1 + condition | item),
                  data = df_merged,
                  control=glmerControl(optimizer="bobyqa"),
                  family = binomial())

# Nested comparison model: identical except the condition fixed effect is
# dropped (the random condition slope is retained so the two models differ
# only in the fixed effect being tested).
model_all_but_lo_and_condition  = glmer(
                  is_start_numeric ~ knowledge_cue+
                    recent_mention + 
                    first_mention +
                    (1 + condition | item),
                  data = df_merged,
                  control=glmerControl(optimizer="bobyqa"),
                  family = binomial())
## boundary (singular) fit: see help('isSingular')
# Likelihood-ratio test: does condition improve fit beyond the covariates?
anova(model_all_but_lo, model_all_but_lo_and_condition)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
model_all_but_lo_and_condition 7 567.8665 598.7950 -276.9332 553.8665 NA NA NA
model_all_but_lo 8 538.1358 573.4827 -261.0679 522.1358 31.7307 1 0

8 Pre-registered analysis 2: Does condition predict response above log-odds?

8.1 Analysis

There is a significant effect of condition when accounting for log-odds.

# Pre-registered analysis 2: same model as analysis 1 but now including
# GPT-3 log-odds as a covariate, so the condition test controls for what
# a language model predicts from the text alone.
model_all_fe = glmer(data = df_merged,
                  is_start_numeric ~ condition + knowledge_cue + log_odds +
                    recent_mention + 
                    first_mention +
                    (1 + condition| item),
                  control=glmerControl(optimizer="bobyqa"),
                  family = binomial())


# Nested comparison: condition removed from the fixed effects (random
# condition slope retained).
model_no_condition = glmer(data = df_merged,
                  is_start_numeric ~ knowledge_cue + log_odds +
                    recent_mention + 
                    first_mention +
                     (1 + condition| item),
                  control=glmerControl(optimizer="bobyqa"),
                  family = binomial())

# Likelihood-ratio test: condition above and beyond log-odds + covariates.
anova(model_all_fe, model_no_condition)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
model_no_condition 8 567.8009 603.1478 -275.9005 551.8009 NA NA NA
model_all_fe 9 539.3875 579.1528 -260.6937 521.3875 30.41345 1 0

The full model shows significant effects of condition and knowledge cue, but not of log-odds, recent mention, or first mention.

# Full model summary. Condition is the dominant predictor; knowledge cue is
# also significant (p = .017, see output below). NOTE(review): the -1.00
# random-effect correlation indicates a singular fit — consider a simpler
# random-effects structure.
summary(model_all_fe)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## is_start_numeric ~ condition + knowledge_cue + log_odds + recent_mention +  
##     first_mention + (1 + condition | item)
##    Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##    539.4    579.2   -260.7    521.4      604 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2897 -0.4869  0.2331  0.3582  2.7109 
## 
## Random effects:
##  Groups Name                 Variance Std.Dev. Corr 
##  item   (Intercept)          0.1821   0.4268        
##         conditionTrue Belief 0.2735   0.5230   -1.00
## Number of obs: 613, groups:  item, 12
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.45122    0.37135   6.601 4.09e-11 ***
## conditionTrue Belief  -3.37187    0.36279  -9.294  < 2e-16 ***
## knowledge_cueImplicit -0.75039    0.31428  -2.388    0.017 *  
## log_odds               0.04088    0.04694   0.871    0.384    
## recent_mentionStart    0.31638    0.22503   1.406    0.160    
## first_mentionStart    -0.02224    0.23284  -0.096    0.924    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtTB knwl_I lg_dds rcnt_S
## condtnTrBlf -0.790                            
## knwldg_cImp -0.635  0.507                     
## log_odds    -0.502  0.547  0.669              
## rcnt_mntnSt -0.255 -0.106 -0.001 -0.023       
## frst_mntnSt -0.139 -0.172 -0.206 -0.267  0.035

The effect of LO approaches significance in the no_condition model.

# Summary of the no-condition model: no fixed effect reaches significance
# (knowledge cue p = .063, log_odds p = .14 in the output below).
summary(model_no_condition)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: is_start_numeric ~ knowledge_cue + log_odds + recent_mention +  
##     first_mention + (1 + condition | item)
##    Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##    567.8    603.1   -275.9    551.8      605 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.3060 -0.5273  0.1979  0.3689  2.8797 
## 
## Random effects:
##  Groups Name                 Variance Std.Dev. Corr 
##  item   (Intercept)           6.809   2.609         
##         conditionTrue Belief 12.817   3.580    -1.00
## Number of obs: 613, groups:  item, 12
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)  
## (Intercept)            0.03996    0.45424   0.088   0.9299  
## knowledge_cueImplicit -0.59347    0.31927  -1.859   0.0631 .
## log_odds               0.07446    0.05087   1.464   0.1433  
## recent_mentionStart    0.28175    0.22012   1.280   0.2006  
## first_mentionStart    -0.05596    0.23089  -0.242   0.8085  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) knwl_I lg_dds rcnt_S
## knwldg_cImp -0.170                     
## log_odds     0.046  0.702              
## rcnt_mntnSt -0.279 -0.012 -0.028       
## frst_mntnSt -0.304 -0.227 -0.287  0.044

8.2 Visualization

We can visualize this in a couple of ways. First, we can look at the residuals of a model without condition, and ask whether they’re correlated with condition.

# Residuals of the no-condition model, plotted against condition, with
# mean +/- 2 SE (mean_se(mult = 2) replaces the equivalent hand-rolled
# lambdas; `<-` replaces `=` for top-level assignment).
df_merged$resid <- residuals(model_no_condition)
df_merged %>%
  ggplot(aes(x = condition,
             y = resid,
             color = condition)) +
  geom_jitter(alpha = .3) +
  stat_summary(fun.data = mean_se, fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  labs(x = "Condition",
       y = "Residuals") +
  geom_hline(yintercept = 0, linetype = "dotted") +
  scale_color_viridis_d() +
  theme_bw()

Residuals are bimodal for all items in TB, and almost all items in FB.

# Same residual plot as above, faceted by item, to check whether the
# bimodality holds within each item. (mean_se(mult = 2) replaces the
# equivalent hand-rolled lambdas; `<-` replaces `=`.)
df_merged$resid <- residuals(model_no_condition)
df_merged %>%
  ggplot(aes(x = condition,
             y = resid,
             color = condition)) +
  geom_jitter(alpha = .3) +
  stat_summary(fun.data = mean_se, fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  labs(x = "Condition",
       y = "Residuals") +
  geom_hline(yintercept = 0, linetype = "dotted") +
  scale_color_viridis_d() +
  theme_bw() +
  facet_wrap(facets = vars(item))

Residuals are also bimodal in all intersections of first mention, recent mention, and knowledge cue, although they appear less bimodal within false belief for kc:implicit, first_mention:end, and recent_mention:start.

# Residuals by condition, colored by knowledge cue and faceted by
# recent x first mention, to check bimodality within every design cell.
# (mean_se(mult = 2) replaces the equivalent hand-rolled lambdas;
# `<-` replaces `=`.)
df_merged$resid <- residuals(model_no_condition)
df_merged %>%
  ggplot(aes(x = condition,
             y = resid,
             color = knowledge_cue)) +
  geom_jitter(alpha = .5) +
  stat_summary(fun.data = mean_se, fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  labs(x = "Condition",
       y = "Residuals") +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme_bw() +
  facet_grid(rows = vars(recent_mention), cols = vars(first_mention),
             labeller = label_both)

  # facet_wrap(facets=vars(item))

Another approach is to bin log-odds, and look at whether the probability of choosing the START location changes as a function of both binned log-odds and condition.

# P(START) as a function of binned GPT-3 log-odds (deciles) and condition.
# BUG FIX: the y variable is is_start_numeric, but the axis was labeled
# "Residuals" (copy-paste from the residual plots above) — relabeled to
# "P(START)". Also replaces the hand-rolled mean +/- 2 SE lambdas with the
# equivalent mean_se(mult = 2).
df_merged %>%
  mutate(binned_lo = ntile(log_odds, n = 10)) %>%
  ggplot(aes(x = binned_lo,
             y = is_start_numeric,
             color = condition)) +
  stat_summary(fun.data = mean_se, fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  geom_smooth() +
  labs(x = "Binned Log-odds",
       y = "P(START)",
       color = "Condition") +
  scale_color_viridis_d() +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

9 LO vs Base

9.1 Analysis

# Section 9: does log-odds improve on a base covariate model?
# Both models use random intercepts only (no condition slope).
# NOTE(review): this silently OVERWRITES model_no_condition from section 8
# (different random structure, bobyqa commented out here) — anything run
# after this point sees the new fit.
model_no_condition = glmer(data = df_merged,
                  is_start_numeric ~ knowledge_cue + log_odds +
                    recent_mention + 
                    first_mention +
                     (1 | item),
                  # control=glmerControl(optimizer="bobyqa"),
                  family = binomial())

# Base model: same covariates without log_odds.
# NOTE(review): also overwrites model_all_but_lo_and_condition from
# section 7 (random structure changed from (1 + condition | item) to
# (1 | item)).
model_all_but_lo_and_condition  = glmer(
                  is_start_numeric ~ knowledge_cue+
                    recent_mention + 
                    first_mention +
                    (1 | item),
                  data = df_merged,
                  control=glmerControl(optimizer="bobyqa"),
                  family = binomial())

# Likelihood-ratio test for the contribution of GPT-3 log-odds.
anova(model_no_condition, model_all_but_lo_and_condition)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
model_all_but_lo_and_condition 5 836.6595 858.7513 -413.3298 826.6595 NA NA NA
model_no_condition 6 676.0707 702.5809 -332.0354 664.0707 162.5888 1 0

10 Ixn

# Section 10: does condition interact with knowledge cue?
# NOTE(review): the bobyqa control is commented out here and the interaction
# model fails to converge (max|grad| warning below). Sections 7-8 used
# bobyqa; re-enabling it may resolve the warning — verify before trusting
# these estimates.
model_all_fe_ixn = glmer(data = df_merged,
                  is_start_numeric ~ condition * knowledge_cue + log_odds +
                    recent_mention + 
                    first_mention +
                    (1 + condition| item),
                  # control=glmerControl(optimizer="bobyqa"),
                  family = binomial())
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00874747 (tol = 0.002, component 1)
# Additive model refit under the same settings (also without bobyqa) so the
# likelihood-ratio comparison is like-for-like.
model_all_fe = glmer(data = df_merged,
                  is_start_numeric ~ condition + knowledge_cue + log_odds +
                    recent_mention + 
                    first_mention +
                    (1 + condition| item),
                  # control=glmerControl(optimizer="bobyqa"),
                  family = binomial())

# Likelihood-ratio test for the condition x knowledge_cue interaction.
anova(model_all_fe_ixn, model_all_fe)
npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
model_all_fe 9 539.3875 579.1528 -260.6937 521.3875 NA NA NA
model_all_fe_ixn 10 535.8245 580.0082 -257.9123 515.8245 5.562938 1 0.0183446
# Interaction model summary: TB x implicit-cue interaction b = 1.26,
# p = .024 (see output below) — but note the convergence warning at the
# bottom of the summary.
summary(model_all_fe_ixn)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## is_start_numeric ~ condition * knowledge_cue + log_odds + recent_mention +  
##     first_mention + (1 + condition | item)
##    Data: df_merged
## 
##      AIC      BIC   logLik deviance df.resid 
##    535.8    580.0   -257.9    515.8      603 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.7277 -0.5524  0.1716  0.3926  2.5634 
## 
## Random effects:
##  Groups Name                 Variance Std.Dev. Corr 
##  item   (Intercept)          0.1820   0.4266        
##         conditionTrue Belief 0.2873   0.5360   -1.00
## Number of obs: 613, groups:  item, 12
## 
## Fixed effects:
##                                            Estimate Std. Error z value Pr(>|z|)
## (Intercept)                                 2.98773    0.48509   6.159 7.32e-10
## conditionTrue Belief                       -4.05664    0.50775  -7.990 1.35e-15
## knowledge_cueImplicit                      -1.52778    0.49231  -3.103  0.00191
## log_odds                                    0.06321    0.04650   1.359  0.17408
## recent_mentionStart                         0.33454    0.22508   1.486  0.13720
## first_mentionStart                         -0.05278    0.23143  -0.228  0.81959
## conditionTrue Belief:knowledge_cueImplicit  1.25756    0.55584   2.262  0.02367
##                                               
## (Intercept)                                ***
## conditionTrue Belief                       ***
## knowledge_cueImplicit                      ** 
## log_odds                                      
## recent_mentionStart                           
## first_mentionStart                            
## conditionTrue Belief:knowledge_cueImplicit *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtTB knwl_I lg_dds rcnt_S frst_S
## condtnTrBlf -0.875                                   
## knwldg_cImp -0.806  0.777                            
## log_odds    -0.273  0.255  0.287                     
## rcnt_mntnSt -0.175 -0.100 -0.026 -0.018              
## frst_mntnSt -0.135 -0.086 -0.093 -0.266  0.028       
## cndtnTBl:_I  0.594 -0.686 -0.759  0.182  0.036 -0.051
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00874747 (tol = 0.002, component 1)

11 Demographics

# Counts and sample proportions for each dyslexia x ADHD x ASD combination.
# (Denominator is the full sample, nrow(ppts.all), matching the original.)
ppts.all %>%
  count(dyslexia, adhd, asd, name = "n") %>%
  mutate(prop = round(n / nrow(ppts.all), 2))
dyslexia adhd asd n prop
False False False 822 0.71
False False True 39 0.03
False True False 149 0.13
False True True 12 0.01
True False False 86 0.07
True True False 16 0.01
True True True 32 0.03
# Bar chart of the demographic combinations. Improvements: the original
# also computed a `prop` column that the plot never used (dropped), and
# geom_col() is the idiomatic form of geom_bar(stat = "identity").
ppts.all %>%
  count(dyslexia, adhd, asd, name = "n") %>%
  ggplot(aes(x = dyslexia, y = n, fill = adhd)) +
  geom_col(position = "dodge") +
  facet_grid(cols = vars(asd), labeller = "label_both") +
  theme_minimal()

11.1 Accuracy by Neurological Condition

# Attach demographics to the modeling data.
# merge() with all.y = F and all.x left at its default (FALSE) is an INNER
# join: trials whose participant_id has no matching ppts.all row are
# dropped. Note the join key: ppts.all uses "id", df_merged uses
# "participant_id".
df_merged <- df_merged %>%
  merge(ppts.all, by.y="id", by.x="participant_id", all.y = F)

# Accuracy by self-reported dyslexia / ADHD / ASD status.
df_merged %>%
  group_by(dyslexia, adhd, asd) %>%
  summarize(n = n(),
            correct = sum(accuracy),
            accuracy = mean(accuracy),
            .groups="drop")
dyslexia adhd asd n correct accuracy
False False False 530 452 0.8528302
False False True 10 6 0.6000000
False True False 49 31 0.6326531
False True True 4 3 0.7500000
True False False 16 11 0.6875000
True True False 1 1 1.0000000
True True True 3 3 1.0000000

Dyslexic participants perform worse.

# Attach demographics to critical trials. merge() with no `by` joins on all
# shared column names — presumably participant_id here.
# NOTE(review): the earlier df_merged merge joined ppts.all by "id";
# confirm ppts.all exposes both "id" and "participant_id", otherwise one of
# these two joins is keyed incorrectly.
critical <- critical %>%
  merge(ppts.all %>% select(participant_id, dyslexia, adhd, asd, age, gender))

# Accuracy by dyslexia status, split by attention exclusion (boot 95% CIs).
critical %>%
  ggplot(aes(x = dyslexia, y = accuracy, fill = excluded.attention)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge", alpha = 0.8) +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
               position = position_dodge(width = 0.9), width = 0.2) +
  scale_fill_manual(values = c("#009933", "#FF0000"))

As do ppts with ADHD

# Accuracy by ADHD status, split by attention exclusion (boot 95% CIs).
critical %>%
  ggplot(aes(x = adhd, y = accuracy, fill = excluded.attention)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge", alpha = 0.8) +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
               position = position_dodge(width = 0.9), width = 0.2) +
  scale_fill_manual(values = c("#009933", "#FF0000"))

And ASD

# Accuracy by ASD status, split by attention exclusion (boot 95% CIs).
critical %>%
  ggplot(aes(x = asd, y = accuracy, fill = excluded.attention)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge", alpha = 0.8) +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
               position = position_dodge(width = 0.9), width = 0.2) +
  scale_fill_manual(values = c("#009933", "#FF0000"))

Men and women perform similarly.

# Accuracy by gender, split by attention exclusion (boot 95% CIs).
critical %>%
  ggplot(aes(x = gender, y = accuracy, fill = excluded.attention)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge", alpha = 0.8) +
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
               position = position_dodge(width = 0.9), width = 0.2) +
  scale_fill_manual(values = c("#009933", "#FF0000"))

Ppts get better with age.

# Accuracy as a function of age (log-10 x axis), binned with bootstrapped
# CIs and a linear fit, split by attention exclusion.
critical %>%
  ggplot(aes(x = age, y = accuracy, color = excluded.attention)) +
  stat_summary_bin(fun.data = mean_cl_boot, geom = "pointrange",
                   binwidth = 0.05) +
  scale_x_log10() +
  geom_smooth(method = "lm", formula = "y~x", se = FALSE) +
  labs(y = "critical accuracy") +
  scale_color_manual(values = c("#009933", "#FF0000"))
## Warning: Removed 1 rows containing missing values (geom_segment).

11.2 Linear Models

11.3 ASD

Negative but non-significant interaction of LO and ASD (b=-0.19, p=0.19; see model summary below).

# ASD model: adds an ASD main effect and an ASD x log-odds interaction to
# the full analysis-2 specification, to test whether GPT-3 log-odds predict
# responses differently for ASD participants.
model.asd = glmer(data = df_merged,
                  is_start_numeric ~ condition + knowledge_cue + log_odds +
                    recent_mention + first_mention + asd + asd:log_odds +
                    (1 + condition| item),
                  control=glmerControl(optimizer="bobyqa"),
                  family = binomial())

# Summary: interaction estimate reported in the output below.
summary(model.asd)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## is_start_numeric ~ condition + knowledge_cue + log_odds + recent_mention +  
##     first_mention + asd + asd:log_odds + (1 + condition | item)
##    Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##    536.1    584.7   -257.0    514.1      602 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.3242 -0.4723  0.2283  0.3571  2.9976 
## 
## Random effects:
##  Groups Name                 Variance Std.Dev. Corr 
##  item   (Intercept)          0.1796   0.4238        
##         conditionTrue Belief 0.2790   0.5282   -0.97
## Number of obs: 613, groups:  item, 12
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.40542    0.37051   6.492 8.46e-11 ***
## conditionTrue Belief  -3.38406    0.36203  -9.348  < 2e-16 ***
## knowledge_cueImplicit -0.72545    0.31843  -2.278   0.0227 *  
## log_odds               0.05554    0.04803   1.156   0.2475    
## recent_mentionStart    0.33808    0.22765   1.485   0.1375    
## first_mentionStart    -0.04544    0.23564  -0.193   0.8471    
## asdTrue                1.32252    0.67657   1.955   0.0506 .  
## log_odds:asdTrue      -0.18522    0.14063  -1.317   0.1878    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtTB knwl_I lg_dds rcnt_S frst_S asdTru
## condtnTrBlf -0.779                                          
## knwldg_cImp -0.634  0.503                                   
## log_odds    -0.488  0.525  0.659                            
## rcnt_mntnSt -0.258 -0.110  0.000 -0.015                     
## frst_mntnSt -0.143 -0.167 -0.209 -0.267  0.032              
## asdTrue     -0.031 -0.034  0.031  0.019  0.034 -0.074       
## lg_dds:sdTr  0.011 -0.006  0.016 -0.143 -0.030 -0.018  0.192

Significant negative interaction of LO and Dyslexia (b=-0.31, p=0.02)

# Same model structure as model.asd, but testing whether the GPT-3
# log-odds effect is moderated by self-reported dyslexia
# (dyslexia:log_odds interaction). Random intercepts and condition
# slopes by item; bobyqa optimizer for fit stability.
model.dyslexia = glmer(data = df_merged,
                  is_start_numeric ~ condition + knowledge_cue + log_odds +
                    recent_mention + first_mention + dyslexia + dyslexia:log_odds +
                    (1 + condition| item),
                  control=glmerControl(optimizer="bobyqa"),
                  family = binomial())

summary(model.dyslexia)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## is_start_numeric ~ condition + knowledge_cue + log_odds + recent_mention +  
##     first_mention + dyslexia + dyslexia:log_odds + (1 + condition |      item)
##    Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##    538.1    586.7   -258.1    516.1      602 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.5784 -0.4682  0.2263  0.3653  2.7943 
## 
## Random effects:
##  Groups Name                 Variance Std.Dev. Corr 
##  item   (Intercept)          0.1922   0.4384        
##         conditionTrue Belief 0.2408   0.4907   -0.96
## Number of obs: 613, groups:  item, 12
## 
## Fixed effects:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.45824    0.37879   6.490  8.6e-11 ***
## conditionTrue Belief  -3.37323    0.36295  -9.294  < 2e-16 ***
## knowledge_cueImplicit -0.74579    0.32249  -2.313   0.0207 *  
## log_odds               0.05664    0.04865   1.164   0.2443    
## recent_mentionStart    0.30900    0.22676   1.363   0.1730    
## first_mentionStart    -0.04165    0.23547  -0.177   0.8596    
## dyslexiaTrue           0.30779    0.65140   0.473   0.6366    
## log_odds:dyslexiaTrue -0.30561    0.13464  -2.270   0.0232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtTB knwl_I lg_dds rcnt_S frst_S dyslxT
## condtnTrBlf -0.788                                          
## knwldg_cImp -0.638  0.522                                   
## log_odds    -0.490  0.541  0.669                            
## rcnt_mntnSt -0.254 -0.108 -0.001 -0.024                     
## frst_mntnSt -0.131 -0.180 -0.215 -0.277  0.041              
## dyslexiaTru -0.150  0.108  0.114  0.084  0.012 -0.071       
## lg_dds:dysT -0.054  0.043  0.011 -0.148  0.011  0.034 -0.119

NS negative interaction of LO and ADHD (b=-0.06, p=0.54)

# Same model structure again, now testing moderation of the GPT-3
# log-odds effect by self-reported ADHD (adhd:log_odds interaction).
# Random intercepts and condition slopes by item; bobyqa optimizer.
model.adhd = glmer(data = df_merged,
                  is_start_numeric ~ condition + knowledge_cue + log_odds +
                    recent_mention + first_mention + adhd + adhd:log_odds +
                    (1 + condition| item),
                  control=glmerControl(optimizer="bobyqa"),
                  family = binomial())

summary(model.adhd)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## is_start_numeric ~ condition + knowledge_cue + log_odds + recent_mention +  
##     first_mention + adhd + adhd:log_odds + (1 + condition | item)
##    Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##    539.7    588.3   -258.8    517.7      602 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.2656 -0.4773  0.2295  0.3601  2.8388 
## 
## Random effects:
##  Groups Name                 Variance Std.Dev. Corr 
##  item   (Intercept)          0.1992   0.4463        
##         conditionTrue Belief 0.2906   0.5391   -0.99
## Number of obs: 613, groups:  item, 12
## 
## Fixed effects:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.376816   0.375413   6.331 2.43e-10 ***
## conditionTrue Belief  -3.376603   0.366220  -9.220  < 2e-16 ***
## knowledge_cueImplicit -0.729177   0.316779  -2.302   0.0213 *  
## log_odds               0.046575   0.048214   0.966   0.3340    
## recent_mentionStart    0.314904   0.225923   1.394   0.1634    
## first_mentionStart    -0.003348   0.235015  -0.014   0.9886    
## adhdTrue               0.715835   0.397925   1.799   0.0720 .  
## log_odds:adhdTrue     -0.064117   0.103996  -0.617   0.5375    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtTB knwl_I lg_dds rcnt_S frst_S adhdTr
## condtnTrBlf -0.783                                          
## knwldg_cImp -0.634  0.505                                   
## log_odds    -0.483  0.531  0.653                            
## rcnt_mntnSt -0.252 -0.107 -0.002 -0.026                     
## frst_mntnSt -0.132 -0.183 -0.210 -0.267  0.034              
## adhdTrue    -0.097 -0.031  0.030 -0.012  0.002  0.042       
## lg_dds:dhdT -0.024  0.010  0.025 -0.196  0.012 -0.009  0.105

12 Visualizations

13 Exploratory Visualizations

# Collapse to one row per item/condition cell: human P(Start), mean GPT-3
# log-odds, human accuracy, and GPT-3's implied P(Start). plogis() is the
# inverse logit, i.e. 1 / (1 + exp(-log_odds)).
df_merged_summ = df_merged %>%
  mutate(p_start_cond = plogis(log_odds)) %>%
  group_by(item_id, condition, knowledge_cue, recent_mention, first_mention) %>%
  summarise(
    prop_start   = mean(is_start),
    lo           = mean(log_odds),
    accuracy     = mean(is_correct),
    p_start_gpt3 = mean(p_start_cond),
    .groups = "drop"
  )

13.1 Density: LO vs P(Start)

# Overlaid density plots comparing the distribution of P(Start) for humans
# (observed proportions) vs GPT-3 (model-implied probabilities), faceted in
# one column and colored by condition (labeled "Knowledge State"). The
# dotted vertical line marks chance (0.5).
df_merged_summ %>%
  mutate("GPT-3\n(Proportion)" = p_start_gpt3,
         "Human\n(Proportion)" = prop_start) %>%
  pivot_longer(cols = c("Human\n(Proportion)", "GPT-3\n(Proportion)"),
               names_to = "metric",
               values_to = "value") %>%
  ggplot(aes(x = value,
             fill = condition)) +
  geom_density(alpha = .5, color="#666666") +
  theme_minimal() +
  facet_wrap(. ~ metric,
             # scales = "free",
             ncol=1,
             strip.position = "left") + 
  geom_vline(xintercept = .5, linetype = "dotted") +
  theme(
    legend.position = "bottom"
  ) + 
  # y axis on the right so the facet strip labels can sit on the left
  scale_y_continuous(position="right") + 
  labs(
    fill = "Knowledge State",
    x = "P(Start)",
    y = "Density"
  ) +
  # Enlarged text for a publication-ready figure
  theme(axis.title = element_text(size=rel(2)),
        axis.text = element_text(size = rel(2)),
        legend.text = element_text(size = rel(2)),
        legend.title = element_text(size = rel(2)),
        strip.text.y = element_text(size = rel(2)))

13.2 Accuracy vs LO-correct

# Human accuracy as a function of GPT-3's log-odds in favor of the CORRECT
# answer. "Start" is the correct response in the False Belief condition, so
# the raw log-odds is kept there and sign-flipped elsewhere.
# Fix: use TRUE (not the reassignable shorthand T) as the case_when
# fallthrough, and drop the trailing comma.
df_merged_summ %>%
  mutate(
    lo.correct = case_when(
      condition == "False Belief" ~ lo,
      TRUE ~ -1 * lo
    )
  ) %>%
  ggplot(aes(x = lo.correct, y = accuracy, color=condition, fill=condition)) + 
  # Small vertical jitter separates overlapping item-level accuracies
  geom_point(position=position_jitter(height=0.01), alpha=0.75) + 
  geom_smooth(method="lm", formula="y~x", alpha=0.15) + 
  theme_minimal() +
  labs(
    y = "Human Accuracy",
    x = "GPT-3 Log-odds Ratio (Correct - Incorrect)",
    fill = "Knowledge State",
    color = "Knowledge State"
  ) + 
  theme(
    legend.position = "top"
  )

13.3 R Squared

# Marginal R^2 (first element of r.squaredGLMM, variance explained by fixed
# effects only) for four nested models fit earlier in the file, then a bar
# chart comparing them. Order here must match the `model` labels below:
# base controls only; base + GPT-3 log-odds; base + condition; full model.
r2 <- c(
r.squaredGLMM(model_all_but_lo_and_condition)[1],
r.squaredGLMM(model_no_condition)[1],
r.squaredGLMM(model_all_but_lo)[1],
r.squaredGLMM(model_all_fe)[1])
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help page.
## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.

## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.

## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.

## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.
model <- c(
  "Base",
  "Base + GPT-3",
  "Base + Condition",
  "Base + GPT-3 + Condition"
)

df.r2 <- data.frame(model, r2)

# Horizontal bars, best-fitting model on top (reorder by descending r2)
df.r2 %>%
  ggplot(aes(x = r2, y = reorder(model, -r2))) + 
  geom_bar(stat="identity", fill = "#69c8ff") + 
  theme_minimal() + 
  labs(
    x = bquote("Marginal"~R^2~""),
    y = "Predictors"
  )

14 Token Generation

14.1 Model scale plot

# Per-model accuracy on the token-generation task, read from a CSV
# produced elsewhere (columns: Model, Accuracy).
tga <- read.csv("token_generation_accuracy.csv")

tga
Model Accuracy
text-babbage-001 0.3177083
text-ada-001 0.3437500
text-curie-001 0.4166667
babbage 0.4375000
ada 0.4427083
curie 0.4583333
davinci 0.6406250
text-davinci-002 0.6979167
## Bar-chart views of per-model accuracy (vertical, then horizontal)

# Vertical bar chart of token-generation accuracy, models ordered worst to
# best. The viridis fill is decorative (one color per model) and the legend
# is suppressed; axis text is enlarged with labels angled for readability.
tga %>%
  ggplot(aes(x = reorder(Model, Accuracy), y = Accuracy, fill = Model)) +
  geom_col() +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(x = "Model",
       y = "Accuracy",
       fill = "Knowledge State") +
  theme(legend.position = "none",
        axis.title = element_text(size = rel(1.5)),
        axis.text.x = element_text(size = rel(1.5), angle = 40, hjust = 1),
        axis.text.y = element_text(size = rel(1.5)),
        legend.text = element_text(size = rel(1.5)),
        legend.title = element_text(size = rel(1.5)),
        strip.text.x = element_text(size = rel(1.5)))

# Horizontal version of the model-accuracy chart with the x axis fixed to
# [0, 1]. The dashed red line marks 0.83 — presumably a human/ceiling
# benchmark; confirm against the paper. Saved to the Figures directory
# (ggsave writes the most recent plot).
tga %>%
  ggplot(aes(y = reorder(Model, Accuracy), x = Accuracy, fill = Model)) +
  geom_col() +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(x = "Model",
       y = "Accuracy",
       fill = "Knowledge State") +
  coord_cartesian(xlim = c(0, 1)) +
  geom_vline(xintercept = 0.83, linetype = "dashed", color = "#ff0000") +
  theme(legend.position = "none",
        axis.title = element_text(size = rel(1.5)),
        axis.text.x = element_text(size = rel(1.7)),
        axis.text.y = element_text(size = rel(1.7)),
        legend.text = element_text(size = rel(1.5)),
        legend.title = element_text(size = rel(1.5)),
        strip.text.x = element_text(size = rel(1.5)))

ggsave("../Figures/textgen-model-acc.pdf", dpi = 300, width = 7, height = 5)